This code covers chapter 2 of “Introduction to Data Mining” by Pang-Ning Tan, Michael Steinbach and Vipin Kumar focusing on using tidyverse instead of base R. See table of contents for code examples for other chapters.

CC This work is licensed under the Creative Commons Attribution 4.0 International License. For questions please contact Michael Hahsler.

library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.0     ✓ purrr   0.3.4
## ✓ tibble  3.0.1     ✓ dplyr   0.8.5
## ✓ tidyr   1.0.3     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggplot2)
library(GGally) # for ggpairs
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
## 
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
## 
##     nasa

Preparation

Load the iris data set and convert the data.frame into a tibble.

# Load the built-in iris data set and convert the data.frame into a tibble
# for nicer printing and stricter subsetting behavior.
data(iris)
iris <- as_tibble(iris)
iris
## # A tibble: 150 x 5
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##           <dbl>       <dbl>        <dbl>       <dbl> <fct>  
##  1          5.1         3.5          1.4         0.2 setosa 
##  2          4.9         3            1.4         0.2 setosa 
##  3          4.7         3.2          1.3         0.2 setosa 
##  4          4.6         3.1          1.5         0.2 setosa 
##  5          5           3.6          1.4         0.2 setosa 
##  6          5.4         3.9          1.7         0.4 setosa 
##  7          4.6         3.4          1.4         0.3 setosa 
##  8          5           3.4          1.5         0.2 setosa 
##  9          4.4         2.9          1.4         0.2 setosa 
## 10          4.9         3.1          1.5         0.1 setosa 
## # … with 140 more rows

Data Quality

Inspect the data (produce a scatterplot matrix using ggpairs from package GGally). You may be able to spot noise and outliers.

ggpairs(iris, aes(color = Species))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Get summary statistics for each column (outliers, missing values)

summary(iris)
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
## 

just the mean

iris %>% summarize_if(is.numeric, mean)
## # A tibble: 1 x 4
##   Sepal.Length Sepal.Width Petal.Length Petal.Width
##          <dbl>       <dbl>        <dbl>       <dbl>
## 1         5.84        3.06         3.76        1.20

Often you will do something like:

# Drop rows with missing values, then remove duplicate rows.
# distinct() is the dplyr idiom for unique() and keeps the tibble class.
clean.data <- iris %>% drop_na() %>% distinct()
summary(clean.data)
##   Sepal.Length    Sepal.Width    Petal.Length    Petal.Width          Species  
##  Min.   :4.300   Min.   :2.00   Min.   :1.000   Min.   :0.100   setosa    :50  
##  1st Qu.:5.100   1st Qu.:2.80   1st Qu.:1.600   1st Qu.:0.300   versicolor:50  
##  Median :5.800   Median :3.00   Median :4.300   Median :1.300   virginica :49  
##  Mean   :5.844   Mean   :3.06   Mean   :3.749   Mean   :1.195                  
##  3rd Qu.:6.400   3rd Qu.:3.30   3rd Qu.:5.100   3rd Qu.:1.800                  
##  Max.   :7.900   Max.   :4.40   Max.   :6.900   Max.   :2.500

Note that one case (non-unique) is gone. All cases with missing values will also have been dropped.

Aggregation

Aggregate by species. First group the data and then summarize each group.

iris %>% group_by(Species) %>% summarize_all(mean)
## # A tibble: 3 x 5
##   Species    Sepal.Length Sepal.Width Petal.Length Petal.Width
##   <fct>             <dbl>       <dbl>        <dbl>       <dbl>
## 1 setosa             5.01        3.43         1.46       0.246
## 2 versicolor         5.94        2.77         4.26       1.33 
## 3 virginica          6.59        2.97         5.55       2.03
iris %>% group_by(Species) %>% summarize_all(median)
## # A tibble: 3 x 5
##   Species    Sepal.Length Sepal.Width Petal.Length Petal.Width
##   <fct>             <dbl>       <dbl>        <dbl>       <dbl>
## 1 setosa              5           3.4         1.5          0.2
## 2 versicolor          5.9         2.8         4.35         1.3
## 3 virginica           6.5         3           5.55         2

Sampling

Random sampling

# Draw a simple random sample of 15 rows. slice_sample() replaces the
# superseded sample_n(). Call set.seed() first for a reproducible sample.
s <- iris %>% slice_sample(n = 15)
ggpairs(s, aes(color = Species))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Stratified sampling

You need to install the package sampling with: install.packages(“sampling”)

library(sampling)
# Draw 5 observations per Species stratum via simple random sampling
# without replacement ("srswor"). Prob = 5/50 = 0.1 for every unit.
# NOTE(review): sampling::strata() assumes the data are sorted by the
# stratification variable(s) -- iris is, but verify for other data sets.
# Call set.seed() first for a reproducible sample.
id2 <- strata(iris, stratanames="Species", size=c(5,5,5), method="srswor")
id2
##        Species ID_unit Prob Stratum
## 17      setosa      17  0.1       1
## 21      setosa      21  0.1       1
## 27      setosa      27  0.1       1
## 40      setosa      40  0.1       1
## 44      setosa      44  0.1       1
## 51  versicolor      51  0.1       2
## 54  versicolor      54  0.1       2
## 74  versicolor      74  0.1       2
## 83  versicolor      83  0.1       2
## 84  versicolor      84  0.1       2
## 119  virginica     119  0.1       3
## 120  virginica     120  0.1       3
## 121  virginica     121  0.1       3
## 124  virginica     124  0.1       3
## 150  virginica     150  0.1       3
s2 <- iris %>% slice(id2$ID_unit)
ggpairs(s2, aes(color = Species))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Features

Dimensionality reduction (Principal Components Analysis - PCA)

Interactive 3d plots (needs package plotly)

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
plot_ly(iris, x = ~Sepal.Length, y = ~Petal.Length, z = ~Sepal.Width,
  size = ~Petal.Width, color = ~Species, type="scatter3d",
 mode="markers")
## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

Calculate the principal components

pc <- prcomp(as.matrix(iris[,1:4]))

How important is each principal component?

plot(pc)

Inspect the raw object (display structure)

str(pc)
## List of 5
##  $ sdev    : num [1:4] 2.056 0.493 0.28 0.154
##  $ rotation: num [1:4, 1:4] 0.3614 -0.0845 0.8567 0.3583 -0.6566 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
##   .. ..$ : chr [1:4] "PC1" "PC2" "PC3" "PC4"
##  $ center  : Named num [1:4] 5.84 3.06 3.76 1.2
##   ..- attr(*, "names")= chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
##  $ scale   : logi FALSE
##  $ x       : num [1:150, 1:4] -2.68 -2.71 -2.89 -2.75 -2.73 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : chr [1:4] "PC1" "PC2" "PC3" "PC4"
##  - attr(*, "class")= chr "prcomp"
ggplot(as_tibble(pc$x), aes(x = PC1, y = PC2, color = iris$Species)) + geom_point()

Plot the projected data and add the original dimensions as arrows (this can be done with ggplot2, but is currently painful; see https://stackoverflow.com/questions/6578355/plotting-pca-biplot-with-ggplot2).

biplot(pc, col = c("grey", "red"))

Feature selection

We will talk about feature selection when we discuss classification models.

Discretize features

ggplot(iris, aes(x = Petal.Width, y = 1:150)) + geom_point()

A histogram is a better visualization for the distribution of a single variable.

ggplot(iris, aes(Petal.Width)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Equal interval width

cut(iris$Sepal.Width, breaks=3)
##   [1] (2.8,3.6] (2.8,3.6] (2.8,3.6] (2.8,3.6] (2.8,3.6] (3.6,4.4] (2.8,3.6]
##   [8] (2.8,3.6] (2.8,3.6] (2.8,3.6] (3.6,4.4] (2.8,3.6] (2.8,3.6] (2.8,3.6]
##  [15] (3.6,4.4] (3.6,4.4] (3.6,4.4] (2.8,3.6] (3.6,4.4] (3.6,4.4] (2.8,3.6]
##  [22] (3.6,4.4] (2.8,3.6] (2.8,3.6] (2.8,3.6] (2.8,3.6] (2.8,3.6] (2.8,3.6]
##  [29] (2.8,3.6] (2.8,3.6] (2.8,3.6] (2.8,3.6] (3.6,4.4] (3.6,4.4] (2.8,3.6]
##  [36] (2.8,3.6] (2.8,3.6] (2.8,3.6] (2.8,3.6] (2.8,3.6] (2.8,3.6] (2,2.8]  
##  [43] (2.8,3.6] (2.8,3.6] (3.6,4.4] (2.8,3.6] (3.6,4.4] (2.8,3.6] (3.6,4.4]
##  [50] (2.8,3.6] (2.8,3.6] (2.8,3.6] (2.8,3.6] (2,2.8]   (2,2.8]   (2,2.8]  
##  [57] (2.8,3.6] (2,2.8]   (2.8,3.6] (2,2.8]   (2,2.8]   (2.8,3.6] (2,2.8]  
##  [64] (2.8,3.6] (2.8,3.6] (2.8,3.6] (2.8,3.6] (2,2.8]   (2,2.8]   (2,2.8]  
##  [71] (2.8,3.6] (2,2.8]   (2,2.8]   (2,2.8]   (2.8,3.6] (2.8,3.6] (2,2.8]  
##  [78] (2.8,3.6] (2.8,3.6] (2,2.8]   (2,2.8]   (2,2.8]   (2,2.8]   (2,2.8]  
##  [85] (2.8,3.6] (2.8,3.6] (2.8,3.6] (2,2.8]   (2.8,3.6] (2,2.8]   (2,2.8]  
##  [92] (2.8,3.6] (2,2.8]   (2,2.8]   (2,2.8]   (2.8,3.6] (2.8,3.6] (2.8,3.6]
##  [99] (2,2.8]   (2,2.8]   (2.8,3.6] (2,2.8]   (2.8,3.6] (2.8,3.6] (2.8,3.6]
## [106] (2.8,3.6] (2,2.8]   (2.8,3.6] (2,2.8]   (2.8,3.6] (2.8,3.6] (2,2.8]  
## [113] (2.8,3.6] (2,2.8]   (2,2.8]   (2.8,3.6] (2.8,3.6] (3.6,4.4] (2,2.8]  
## [120] (2,2.8]   (2.8,3.6] (2,2.8]   (2,2.8]   (2,2.8]   (2.8,3.6] (2.8,3.6]
## [127] (2,2.8]   (2.8,3.6] (2,2.8]   (2.8,3.6] (2,2.8]   (3.6,4.4] (2,2.8]  
## [134] (2,2.8]   (2,2.8]   (2.8,3.6] (2.8,3.6] (2.8,3.6] (2.8,3.6] (2.8,3.6]
## [141] (2.8,3.6] (2.8,3.6] (2,2.8]   (2.8,3.6] (2.8,3.6] (2.8,3.6] (2,2.8]  
## [148] (2.8,3.6] (2.8,3.6] (2.8,3.6]
## Levels: (2,2.8] (2.8,3.6] (3.6,4.4]

Other methods (equal frequency, k-means clustering, etc.)

library(arules)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Attaching package: 'arules'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following objects are masked from 'package:base':
## 
##     abbreviate, write
# breaks replaces the deprecated categories argument; keep the method
# explicit since discretize()'s default changed to "frequency".
discretize(iris$Petal.Width, method = "interval", breaks = 3)
##   [1] [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9)
##   [8] [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9)
##  [15] [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9)
##  [22] [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9)
##  [29] [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9)
##  [36] [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9)
##  [43] [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9)
##  [50] [0.1,0.9) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7)
##  [57] [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7)
##  [64] [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7)
##  [71] [1.7,2.5] [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7)
##  [78] [1.7,2.5] [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7)
##  [85] [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7)
##  [92] [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7)
##  [99] [0.9,1.7) [0.9,1.7) [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5]
## [106] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5]
## [113] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5]
## [120] [0.9,1.7) [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5]
## [127] [1.7,2.5] [1.7,2.5] [1.7,2.5] [0.9,1.7) [1.7,2.5] [1.7,2.5] [1.7,2.5]
## [134] [0.9,1.7) [0.9,1.7) [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5]
## [141] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5]
## [148] [1.7,2.5] [1.7,2.5] [1.7,2.5]
## attr(,"discretized:breaks")
## [1] 0.1 0.9 1.7 2.5
## attr(,"discretized:method")
## [1] interval
## Levels: [0.1,0.9) [0.9,1.7) [1.7,2.5]
# Equal-frequency bins: each interval holds about the same number of points.
# breaks replaces the deprecated categories argument.
discretize(iris$Petal.Width, method = "frequency", breaks = 3)
##   [1] [0.1,0.867) [0.1,0.867) [0.1,0.867) [0.1,0.867) [0.1,0.867) [0.1,0.867)
##   [7] [0.1,0.867) [0.1,0.867) [0.1,0.867) [0.1,0.867) [0.1,0.867) [0.1,0.867)
##  [13] [0.1,0.867) [0.1,0.867) [0.1,0.867) [0.1,0.867) [0.1,0.867) [0.1,0.867)
##  [19] [0.1,0.867) [0.1,0.867) [0.1,0.867) [0.1,0.867) [0.1,0.867) [0.1,0.867)
##  [25] [0.1,0.867) [0.1,0.867) [0.1,0.867) [0.1,0.867) [0.1,0.867) [0.1,0.867)
##  [31] [0.1,0.867) [0.1,0.867) [0.1,0.867) [0.1,0.867) [0.1,0.867) [0.1,0.867)
##  [37] [0.1,0.867) [0.1,0.867) [0.1,0.867) [0.1,0.867) [0.1,0.867) [0.1,0.867)
##  [43] [0.1,0.867) [0.1,0.867) [0.1,0.867) [0.1,0.867) [0.1,0.867) [0.1,0.867)
##  [49] [0.1,0.867) [0.1,0.867) [0.867,1.6) [0.867,1.6) [0.867,1.6) [0.867,1.6)
##  [55] [0.867,1.6) [0.867,1.6) [1.6,2.5]   [0.867,1.6) [0.867,1.6) [0.867,1.6)
##  [61] [0.867,1.6) [0.867,1.6) [0.867,1.6) [0.867,1.6) [0.867,1.6) [0.867,1.6)
##  [67] [0.867,1.6) [0.867,1.6) [0.867,1.6) [0.867,1.6) [1.6,2.5]   [0.867,1.6)
##  [73] [0.867,1.6) [0.867,1.6) [0.867,1.6) [0.867,1.6) [0.867,1.6) [1.6,2.5]  
##  [79] [0.867,1.6) [0.867,1.6) [0.867,1.6) [0.867,1.6) [0.867,1.6) [1.6,2.5]  
##  [85] [0.867,1.6) [1.6,2.5]   [0.867,1.6) [0.867,1.6) [0.867,1.6) [0.867,1.6)
##  [91] [0.867,1.6) [0.867,1.6) [0.867,1.6) [0.867,1.6) [0.867,1.6) [0.867,1.6)
##  [97] [0.867,1.6) [0.867,1.6) [0.867,1.6) [0.867,1.6) [1.6,2.5]   [1.6,2.5]  
## [103] [1.6,2.5]   [1.6,2.5]   [1.6,2.5]   [1.6,2.5]   [1.6,2.5]   [1.6,2.5]  
## [109] [1.6,2.5]   [1.6,2.5]   [1.6,2.5]   [1.6,2.5]   [1.6,2.5]   [1.6,2.5]  
## [115] [1.6,2.5]   [1.6,2.5]   [1.6,2.5]   [1.6,2.5]   [1.6,2.5]   [0.867,1.6)
## [121] [1.6,2.5]   [1.6,2.5]   [1.6,2.5]   [1.6,2.5]   [1.6,2.5]   [1.6,2.5]  
## [127] [1.6,2.5]   [1.6,2.5]   [1.6,2.5]   [1.6,2.5]   [1.6,2.5]   [1.6,2.5]  
## [133] [1.6,2.5]   [0.867,1.6) [0.867,1.6) [1.6,2.5]   [1.6,2.5]   [1.6,2.5]  
## [139] [1.6,2.5]   [1.6,2.5]   [1.6,2.5]   [1.6,2.5]   [1.6,2.5]   [1.6,2.5]  
## [145] [1.6,2.5]   [1.6,2.5]   [1.6,2.5]   [1.6,2.5]   [1.6,2.5]   [1.6,2.5]  
## attr(,"discretized:breaks")
## [1] 0.1000000 0.8666667 1.6000000 2.5000000
## attr(,"discretized:method")
## [1] frequency
## Levels: [0.1,0.867) [0.867,1.6) [1.6,2.5]
# k-means based bins: boundaries fall between the clusters of values.
# breaks replaces the deprecated categories argument.
discretize(iris$Petal.Width, method = "cluster", breaks = 3)
##   [1] [0.1,0.792)  [0.1,0.792)  [0.1,0.792)  [0.1,0.792)  [0.1,0.792) 
##   [6] [0.1,0.792)  [0.1,0.792)  [0.1,0.792)  [0.1,0.792)  [0.1,0.792) 
##  [11] [0.1,0.792)  [0.1,0.792)  [0.1,0.792)  [0.1,0.792)  [0.1,0.792) 
##  [16] [0.1,0.792)  [0.1,0.792)  [0.1,0.792)  [0.1,0.792)  [0.1,0.792) 
##  [21] [0.1,0.792)  [0.1,0.792)  [0.1,0.792)  [0.1,0.792)  [0.1,0.792) 
##  [26] [0.1,0.792)  [0.1,0.792)  [0.1,0.792)  [0.1,0.792)  [0.1,0.792) 
##  [31] [0.1,0.792)  [0.1,0.792)  [0.1,0.792)  [0.1,0.792)  [0.1,0.792) 
##  [36] [0.1,0.792)  [0.1,0.792)  [0.1,0.792)  [0.1,0.792)  [0.1,0.792) 
##  [41] [0.1,0.792)  [0.1,0.792)  [0.1,0.792)  [0.1,0.792)  [0.1,0.792) 
##  [46] [0.1,0.792)  [0.1,0.792)  [0.1,0.792)  [0.1,0.792)  [0.1,0.792) 
##  [51] [0.792,1.71) [0.792,1.71) [0.792,1.71) [0.792,1.71) [0.792,1.71)
##  [56] [0.792,1.71) [0.792,1.71) [0.792,1.71) [0.792,1.71) [0.792,1.71)
##  [61] [0.792,1.71) [0.792,1.71) [0.792,1.71) [0.792,1.71) [0.792,1.71)
##  [66] [0.792,1.71) [0.792,1.71) [0.792,1.71) [0.792,1.71) [0.792,1.71)
##  [71] [1.71,2.5]   [0.792,1.71) [0.792,1.71) [0.792,1.71) [0.792,1.71)
##  [76] [0.792,1.71) [0.792,1.71) [0.792,1.71) [0.792,1.71) [0.792,1.71)
##  [81] [0.792,1.71) [0.792,1.71) [0.792,1.71) [0.792,1.71) [0.792,1.71)
##  [86] [0.792,1.71) [0.792,1.71) [0.792,1.71) [0.792,1.71) [0.792,1.71)
##  [91] [0.792,1.71) [0.792,1.71) [0.792,1.71) [0.792,1.71) [0.792,1.71)
##  [96] [0.792,1.71) [0.792,1.71) [0.792,1.71) [0.792,1.71) [0.792,1.71)
## [101] [1.71,2.5]   [1.71,2.5]   [1.71,2.5]   [1.71,2.5]   [1.71,2.5]  
## [106] [1.71,2.5]   [0.792,1.71) [1.71,2.5]   [1.71,2.5]   [1.71,2.5]  
## [111] [1.71,2.5]   [1.71,2.5]   [1.71,2.5]   [1.71,2.5]   [1.71,2.5]  
## [116] [1.71,2.5]   [1.71,2.5]   [1.71,2.5]   [1.71,2.5]   [0.792,1.71)
## [121] [1.71,2.5]   [1.71,2.5]   [1.71,2.5]   [1.71,2.5]   [1.71,2.5]  
## [126] [1.71,2.5]   [1.71,2.5]   [1.71,2.5]   [1.71,2.5]   [0.792,1.71)
## [131] [1.71,2.5]   [1.71,2.5]   [1.71,2.5]   [0.792,1.71) [0.792,1.71)
## [136] [1.71,2.5]   [1.71,2.5]   [1.71,2.5]   [1.71,2.5]   [1.71,2.5]  
## [141] [1.71,2.5]   [1.71,2.5]   [1.71,2.5]   [1.71,2.5]   [1.71,2.5]  
## [146] [1.71,2.5]   [1.71,2.5]   [1.71,2.5]   [1.71,2.5]   [1.71,2.5]  
## attr(,"discretized:breaks")
## [1] 0.1000000 0.7915185 1.7054750 2.5000000
## attr(,"discretized:method")
## [1] cluster
## Levels: [0.1,0.792) [0.792,1.71) [1.71,2.5]
ggplot(iris, aes(Petal.Width)) + geom_histogram() +
  geom_vline(xintercept =
      discretize(iris$Petal.Width, method="interval", breaks = 3, onlycuts = TRUE),
    color = "blue") +
  labs(title = "Discretization: interval", subtitle = "Blue lines are boundaries")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(iris, aes(Petal.Width)) + geom_histogram() +
  geom_vline(xintercept =
      discretize(iris$Petal.Width, method="frequency", breaks = 3, onlycuts = TRUE),
    color = "blue") +
  labs(title = "Discretization: frequency", subtitle = "Blue lines are boundaries")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(iris, aes(Petal.Width)) + geom_histogram() +
  geom_vline(xintercept =
      discretize(iris$Petal.Width, method="cluster", breaks = 3, onlycuts = TRUE),
    color = "blue") +
  labs(title = "Discretization: cluster", subtitle = "Blue lines are boundaries")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Standardize data (Z-score)

Standardize the scale of features to make them comparable. For each column the mean is subtracted (centering) and it is divided by the standard deviation (scaling). Now most values should be in [-3,3].

# Z-score standardize all numeric columns: subtract the mean (centering) and
# divide by the standard deviation (scaling). across() replaces the
# superseded mutate_if(); as.vector() strips the attributes scale() attaches.
iris.scaled <- iris %>%
  mutate(across(where(is.numeric), function(x) as.vector(scale(x))))
iris.scaled
## # A tibble: 150 x 5
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##           <dbl>       <dbl>        <dbl>       <dbl> <fct>  
##  1       -0.898      1.02          -1.34       -1.31 setosa 
##  2       -1.14      -0.132         -1.34       -1.31 setosa 
##  3       -1.38       0.327         -1.39       -1.31 setosa 
##  4       -1.50       0.0979        -1.28       -1.31 setosa 
##  5       -1.02       1.25          -1.34       -1.31 setosa 
##  6       -0.535      1.93          -1.17       -1.05 setosa 
##  7       -1.50       0.786         -1.34       -1.18 setosa 
##  8       -1.02       0.786         -1.28       -1.31 setosa 
##  9       -1.74      -0.361         -1.34       -1.31 setosa 
## 10       -1.14       0.0979        -1.28       -1.44 setosa 
## # … with 140 more rows
summary(iris.scaled)
##   Sepal.Length       Sepal.Width       Petal.Length      Petal.Width     
##  Min.   :-1.86378   Min.   :-2.4258   Min.   :-1.5623   Min.   :-1.4422  
##  1st Qu.:-0.89767   1st Qu.:-0.5904   1st Qu.:-1.2225   1st Qu.:-1.1799  
##  Median :-0.05233   Median :-0.1315   Median : 0.3354   Median : 0.1321  
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.67225   3rd Qu.: 0.5567   3rd Qu.: 0.7602   3rd Qu.: 0.7880  
##  Max.   : 2.48370   Max.   : 3.0805   Max.   : 1.7799   Max.   : 1.7064  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
## 

Proximities: Similarities and distances

Note: R actually only uses dissimilarities/distances.

Minkowski distances

iris_sample <- iris.scaled %>% select(-Species) %>% slice(1:5)
iris_sample
## # A tibble: 5 x 4
##   Sepal.Length Sepal.Width Petal.Length Petal.Width
##          <dbl>       <dbl>        <dbl>       <dbl>
## 1       -0.898      1.02          -1.34       -1.31
## 2       -1.14      -0.132         -1.34       -1.31
## 3       -1.38       0.327         -1.39       -1.31
## 4       -1.50       0.0979        -1.28       -1.31
## 5       -1.02       1.25          -1.34       -1.31

Calculate distances matrices between the first 5 flowers (use only the 4 numeric columns).

# Euclidean (L2) distance: straight-line distance in feature space.
dist(iris_sample, method="euclidean")
##           1         2         3         4
## 2 1.1722914                              
## 3 0.8427840 0.5216255                    
## 4 1.0999999 0.4325508 0.2829432          
## 5 0.2592702 1.3818560 0.9882608 1.2459861
# Manhattan (L1) distance: sum of absolute coordinate differences.
dist(iris_sample, method="manhattan")
##           1         2         3         4
## 2 1.3886674                              
## 3 1.2279853 0.7570306                    
## 4 1.5781768 0.6483657 0.4634868          
## 5 0.3501915 1.4973323 1.3366502 1.6868417
# Maximum (L-infinity) distance: the largest single coordinate difference.
dist(iris_sample, method="maximum")
##           1         2         3         4
## 2 1.1471408                              
## 3 0.6882845 0.4588563                    
## 4 0.9177126 0.3622899 0.2294282          
## 5 0.2294282 1.3765690 0.9177126 1.1471408

Note: Don’t forget to scale the data if the ranges are very different!

Distances for binary data (Jaccard and Hamming)

# Two binary (0/1) feature vectors stored as the rows of a 2 x 10 matrix.
b <- matrix(c(0, 0, 0, 1, 1, 1, 1, 0, 0, 1,
              0, 0, 1, 1, 1, 0, 0, 1, 0, 0),
            nrow = 2, byrow = TRUE)
b
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,]    0    0    0    1    1    1    1    0    0     1
## [2,]    0    0    1    1    1    0    0    1    0     0

Jaccard index

Jaccard index is a similarity measure so R reports 1-Jaccard

dist(b, method = "binary")
##           1
## 2 0.7142857

Hamming distance

Hamming distance is the number of mis-matches (equivalent to Manhattan distance on 0-1 data and also the squared Euclidean distance).

dist(b, method = "manhattan")
##   1
## 2 5
dist(b, method = "euclidean")^2
##   1
## 2 5

Distances for mixed data

Gower’s distance

Works with mixed data

# Small example of mixed data: two ratio-scaled features (height in cm,
# weight in kg -- presumably; verify units) and one nominal feature (sex).
data <- data.frame(
  height = c(160, 185, 170),
  weight = c(52, 90, 75),
  sex = c("female", "male", "male")
)
data
##   height weight    sex
## 1    160     52 female
## 2    185     90   male
## 3    170     75   male

Note: Nominal variables need to be factors!

data <- data %>% mutate_if(is.character, factor)

library(proxy)
## 
## Attaching package: 'proxy'
## The following object is masked from 'package:Matrix':
## 
##     as.matrix
## The following objects are masked from 'package:stats':
## 
##     as.dist, dist
## The following object is masked from 'package:base':
## 
##     as.matrix
d_Gower <- dist(data, method="Gower")
d_Gower
##           1         2
## 2 1.0000000          
## 3 0.6684211 0.3315789

Note: Gower’s distance automatically scales, so no need to scale the data first.

Using Euclidean distance with mixed data

Sometimes methods (e.g., k-means) only can use Euclidean distance. In this case, nominal features can be converted into 0-1 dummy variables. Euclidean distance on these will result in a usable distance measure.

Create dummy variables

library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:sampling':
## 
##     cluster
## The following object is masked from 'package:purrr':
## 
##     lift
data_dummy <- predict(dummyVars(~., data), data)
data_dummy
##   height weight sex.female sex.male
## 1    160     52          1        0
## 2    185     90          0        1
## 3    170     75          0        1

Since sex has now two columns, we need to weight them by 1/2 after scaling.

# Scale the dummy-coded data, then down-weight the two sex dummies by 1/2
# each so that together they count as much as one original feature.
weight <- matrix(c(1,1,1/2,1/2), ncol = 4, nrow = nrow(data_dummy), byrow = TRUE)
data_dummy_scaled <- scale(data_dummy) * weight

d_dummy <- dist(data_dummy_scaled)
d_dummy
##          1        2
## 2 3.064169         
## 3 1.890931 1.426621

Distance is (mostly) consistent with Gower’s distance (other than that Gower’s distance is scaled between 0 and 1).

plot(d_dummy, d_Gower, xlab = "Euclidean w/dummy", ylab = "Gower")

Additional proximity measures available in package proxy

library(proxy)
names(pr_DB$get_entries())
##  [1] "Jaccard"         "Kulczynski1"     "Kulczynski2"     "Mountford"      
##  [5] "Fager"           "Russel"          "simple matching" "Hamman"         
##  [9] "Faith"           "Tanimoto"        "Dice"            "Phi"            
## [13] "Stiles"          "Michael"         "Mozley"          "Yule"           
## [17] "Yule2"           "Ochiai"          "Simpson"         "Braun-Blanquet" 
## [21] "cosine"          "eJaccard"        "eDice"           "correlation"    
## [25] "Chi-squared"     "Phi-squared"     "Tschuprow"       "Cramer"         
## [29] "Pearson"         "Gower"           "Euclidean"       "Mahalanobis"    
## [33] "Bhjattacharyya"  "Manhattan"       "supremum"        "Minkowski"      
## [37] "Canberra"        "Wave"            "divergence"      "Kullback"       
## [41] "Bray"            "Soergel"         "Levenshtein"     "Podani"         
## [45] "Chord"           "Geodesic"        "Whittaker"       "Hellinger"      
## [49] "fJaccard"

Relationship between features

Correlation (for ratio/interval scaled features)

Pearson correlation between features (columns)

cor(iris[,1:4])
##              Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length    1.0000000  -0.1175698    0.8717538   0.8179411
## Sepal.Width    -0.1175698   1.0000000   -0.4284401  -0.3661259
## Petal.Length    0.8717538  -0.4284401    1.0000000   0.9628654
## Petal.Width     0.8179411  -0.3661259    0.9628654   1.0000000
ggplot(iris, aes(Petal.Length, Petal.Width)) + geom_point()

cor(iris$Petal.Length, iris$Petal.Width)
## [1] 0.9628654
cor.test(iris$Petal.Length, iris$Petal.Width)
## 
##  Pearson's product-moment correlation
## 
## data:  iris$Petal.Length and iris$Petal.Width
## t = 43.387, df = 148, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9490525 0.9729853
## sample estimates:
##       cor 
## 0.9628654
ggplot(iris, aes(Sepal.Length, Sepal.Width)) + geom_point()

cor(iris$Sepal.Length, iris$Sepal.Width)
## [1] -0.1175698
cor.test(iris$Sepal.Length, iris$Sepal.Width)
## 
##  Pearson's product-moment correlation
## 
## data:  iris$Sepal.Length and iris$Sepal.Width
## t = -1.4403, df = 148, p-value = 0.1519
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.27269325  0.04351158
## sample estimates:
##        cor 
## -0.1175698

Correlation between objects (transpose matrix first)

cc <- cor(t(iris[,1:4]))
dim(cc)
## [1] 150 150
cc[1:10,1:10]
##            [,1]      [,2]      [,3]      [,4]      [,5]      [,6]      [,7]
##  [1,] 1.0000000 0.9959987 0.9999739 0.9981685 0.9993473 0.9995861 0.9988112
##  [2,] 0.9959987 1.0000000 0.9966071 0.9973966 0.9922327 0.9935919 0.9907206
##  [3,] 0.9999739 0.9966071 1.0000000 0.9983335 0.9990611 0.9993773 0.9984377
##  [4,] 0.9981685 0.9973966 0.9983335 1.0000000 0.9967188 0.9978326 0.9961394
##  [5,] 0.9993473 0.9922327 0.9990611 0.9967188 1.0000000 0.9998833 0.9999140
##  [6,] 0.9995861 0.9935919 0.9993773 0.9978326 0.9998833 1.0000000 0.9997226
##  [7,] 0.9988112 0.9907206 0.9984377 0.9961394 0.9999140 0.9997226 1.0000000
##  [8,] 0.9995381 0.9971181 0.9996045 0.9995456 0.9985032 0.9991788 0.9979521
##  [9,] 0.9980766 0.9985463 0.9983561 0.9998333 0.9960309 0.9972157 0.9952140
## [10,] 0.9965520 0.9990329 0.9969856 0.9993068 0.9937612 0.9952606 0.9927272
##            [,8]      [,9]     [,10]
##  [1,] 0.9995381 0.9980766 0.9965520
##  [2,] 0.9971181 0.9985463 0.9990329
##  [3,] 0.9996045 0.9983561 0.9969856
##  [4,] 0.9995456 0.9998333 0.9993068
##  [5,] 0.9985032 0.9960309 0.9937612
##  [6,] 0.9991788 0.9972157 0.9952606
##  [7,] 0.9979521 0.9952140 0.9927272
##  [8,] 1.0000000 0.9994062 0.9983737
##  [9,] 0.9994062 1.0000000 0.9997398
## [10,] 0.9983737 0.9997398 1.0000000
library("seriation") # for pimage
## Registered S3 methods overwritten by 'registry':
##   method               from 
##   print.registry_field proxy
##   print.registry_entry proxy
## Registered S3 method overwritten by 'seriation':
##   method         from 
##   reorder.hclust gclus
## 
## Attaching package: 'seriation'
## The following object is masked from 'package:lattice':
## 
##     panel.lines
pimage(cc, main = "Correlation between objects")

Convert correlations into a dissimilarities

# Convert correlations into dissimilarities: strongly (positively or
# negatively) correlated objects get a small distance, so use 1 - |cor|.
d <- as.dist(1 - abs(cc))
pimage(d, main = "Dissimilarities between objects")

Rank correlation (for ordinal features)

Convert the numeric variables into ordinal variables (ordered factors with three levels) using cut (see ? cut).

# Cut each numeric column into three ordered levels. Spell out
# ordered_result in full -- the original "ordered = TRUE" relied on partial
# argument matching. across() replaces the superseded mutate_if().
iris_ord <- iris %>% mutate(across(where(is.numeric),
  function(x) cut(x, 3, labels = c("short", "medium", "long"),
                  ordered_result = TRUE)))

iris_ord
## # A tibble: 150 x 5
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##    <ord>        <ord>       <ord>        <ord>       <fct>  
##  1 short        medium      short        short       setosa 
##  2 short        medium      short        short       setosa 
##  3 short        medium      short        short       setosa 
##  4 short        medium      short        short       setosa 
##  5 short        medium      short        short       setosa 
##  6 short        long        short        short       setosa 
##  7 short        medium      short        short       setosa 
##  8 short        medium      short        short       setosa 
##  9 short        medium      short        short       setosa 
## 10 short        medium      short        short       setosa 
## # … with 140 more rows
summary(iris_ord)
##  Sepal.Length Sepal.Width Petal.Length Petal.Width       Species  
##  short :59    short :47   short :50    short :50   setosa    :50  
##  medium:71    medium:88   medium:54    medium:54   versicolor:50  
##  long  :20    long  :15   long  :46    long  :46   virginica :50
head(iris_ord$Sepal.Length)
## [1] short short short short short short
## Levels: short < medium < long

Kendall’s tau rank correlation coefficient

cor(sapply(iris_ord[,1:4], xtfrm), method="kendall")
##              Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length    1.0000000  -0.1437985    0.7418595   0.7295139
## Sepal.Width    -0.1437985   1.0000000   -0.3298796  -0.3154474
## Petal.Length    0.7418595  -0.3298796    1.0000000   0.9198290
## Petal.Width     0.7295139  -0.3154474    0.9198290   1.0000000

Spearman’s rho

cor(sapply(iris_ord[,1:4], xtfrm), method="spearman")
##              Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length    1.0000000  -0.1569659    0.7937613   0.7843406
## Sepal.Width    -0.1569659   1.0000000   -0.3662775  -0.3517262
## Petal.Length    0.7937613  -0.3662775    1.0000000   0.9399038
## Petal.Width     0.7843406  -0.3517262    0.9399038   1.0000000

Note: unfortunately we have to transform the ordered factors into numbers representing the order with xtfrm first.

Compare to the Pearson correlation on the original data

cor(iris[,1:4])
##              Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length    1.0000000  -0.1175698    0.8717538   0.8179411
## Sepal.Width    -0.1175698   1.0000000   -0.4284401  -0.3661259
## Petal.Length    0.8717538  -0.4284401    1.0000000   0.9628654
## Petal.Width     0.8179411  -0.3661259    0.9628654   1.0000000

Relationship between nominal and ordinal features

Is sepal length and species related? Use cross tabulation

tbl <- with(iris_ord, table(Sepal.Length, Species))
tbl
##             Species
## Sepal.Length setosa versicolor virginica
##       short      47         11         1
##       medium      3         36        32
##       long        0          3        17
# The same cross tabulation with tidyverse verbs: count every
# (Species, Sepal.Length) combination, then spread Species into columns.
# values_fill = 0 reports empty cells as 0 instead of NA, matching table().
iris_ord %>% select(c("Species", "Sepal.Length")) %>%
  pivot_longer(cols = "Sepal.Length") %>%
  group_by(Species, value) %>% count() %>% ungroup() %>%
  pivot_wider(names_from = Species, values_from = n, values_fill = 0)
## # A tibble: 3 x 4
##   value  setosa versicolor virginica
##   <ord>   <int>      <int>     <int>
## 1 short      47         11         1
## 2 medium      3         36        32
## 3 long        0          3        17

Test of Independence: Pearson’s chi-squared test is performed with the null hypothesis that the joint distribution of the cell counts in a 2-dimensional contingency table is the product of the row and column marginals. (h0 is independence)

chisq.test(tbl)
## 
##  Pearson's Chi-squared test
## 
## data:  tbl
## X-squared = 111.63, df = 4, p-value < 2.2e-16

Using xtabs instead

x <- xtabs(~Sepal.Length + Species, data=iris_ord)
x
##             Species
## Sepal.Length setosa versicolor virginica
##       short      47         11         1
##       medium      3         36        32
##       long        0          3        17
summary(x)
## Call: xtabs(formula = ~Sepal.Length + Species, data = iris_ord)
## Number of cases in table: 150 
## Number of factors: 2 
## Test for independence of all factors:
##  Chisq = 111.63, df = 4, p-value = 3.262e-23

Groupwise averages

iris %>% group_by(Species) %>% summarize_at(vars(Sepal.Length), mean)
## # A tibble: 3 x 2
##   Species    Sepal.Length
##   <fct>             <dbl>
## 1 setosa             5.01
## 2 versicolor         5.94
## 3 virginica          6.59
iris %>% group_by(Species) %>% summarize_all(mean)
## # A tibble: 3 x 5
##   Species    Sepal.Length Sepal.Width Petal.Length Petal.Width
##   <fct>             <dbl>       <dbl>        <dbl>       <dbl>
## 1 setosa             5.01        3.43         1.46       0.246
## 2 versicolor         5.94        2.77         4.26       1.33 
## 3 virginica          6.59        2.97         5.55       2.03

Density estimation

Just plotting the data is not very helpful

ggplot(iris, aes(Petal.Length, 1:150)) + geom_point()

Histograms work better

ggplot(iris, aes(Petal.Length)) +
  geom_histogram() +
  geom_rug(alpha = 1/10)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Kernel density estimate KDE

ggplot(iris, aes(Petal.Length)) +
  geom_rug(alpha = 1/10) +
  geom_density()

Plot 2d kernel density estimate

ggplot(iris, aes(Sepal.Length, Sepal.Width)) +
  geom_jitter() +
  geom_density2d()

ggplot(iris, aes(Sepal.Length, Sepal.Width)) +
  geom_bin2d(bins = 10) +
  geom_jitter(color = "red")

ggplot(iris, aes(Sepal.Length, Sepal.Width)) +
  geom_hex(bins = 10) +
  geom_jitter(color = "red")